{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Loading the train fle\n",
    "train=pd.read_csv('C:/Users/chandu prakash/Downloads/Trainweek3.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Loading the test file\n",
    "test=pd.read_csv('C:/Users/chandu prakash/Downloads/Testweek3.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([0, 1], dtype=int64), array([579, 368], dtype=int64))"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Checking if the labels are properly distribuited or not \n",
    "np.unique(train.IsGoodNews,return_counts=True)\n",
    "#I could see  it is not properly distribuited,so i use randomOversampling technique "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Separating Predictors and Target variable\n",
    "X=train.drop(['IsGoodNews'],axis=1)\n",
    "Y=train['IsGoodNews']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "#Importing the imblearn package\n",
    "from imblearn.over_sampling import RandomOverSampler\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Balancing the labels with rato 1 equally\n",
    "os =RandomOverSampler(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Applying the technique\n",
    "X_train_reshape, y_train_reshape = os.fit_sample(X, Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# splitting X and y into training and testing sets \n",
    "from sklearn.model_selection import train_test_split \n",
    "X_train, X_test, y_train, y_test = train_test_split(X_train_reshape, y_train_reshape, test_size=0.2, random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# I used this packages for Hyperparametr tuning\n",
    "from sklearn.model_selection import RandomizedSearchCV, GridSearchCV\n",
    "import xgboost\n",
    "xgb=xgboost.XGBClassifier()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# training the model on training set \n",
    "from sklearn.naive_bayes import GaussianNB \n",
    "gnb = GaussianNB()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "metadata": {},
   "outputs": [],
   "source": [
    "random_search=RandomizedSearchCV(model,param_distributions=params,n_iter=10,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "random_search.fit(X_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "random_search.best_estimator_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from lightgbm import LGBMClassifier\n",
    "\n",
    "lgbm = LGBMClassifier()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 243,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.neural_network import MLPClassifier\n",
    "\n",
    "#Initializing the MLPClassifier\n",
    "mlp = MLPClassifier(hidden_layer_sizes=(150,100,50), \n",
    "                           max_iter=300,activation = 'relu',solver='adam',random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SVM classifier\n",
    "from sklearn.svm import SVC\n",
    "svm=SVC()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 282,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Randomforest classifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "rfc = RandomForestClassifier()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 237,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import BaggingClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "bag=BaggingClassifier(DecisionTreeClassifier(),n_estimators=500,max_samples=100,bootstrap=True,n_jobs=-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost import CatBoostClassifier\n",
    "model = CatBoostClassifier(thread_count=2,\n",
    "                         loss_function='Logloss',\n",
    "                        \n",
    "                         od_type = 'Iter',\n",
    "                         verbose= False,\n",
    "                           bagging_temperature=1,\n",
    "                           l2_leaf_reg=2,\n",
    "                           depth=5,\n",
    "                           random_strength=5,\n",
    "                           learning_rate=0.088,\n",
    "                           scale_pos_weight=0.5,\n",
    "                           iterations=1000\n",
    "                           \n",
    "                        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 203,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "lg=LogisticRegression()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 380,
   "metadata": {},
   "outputs": [],
   "source": [
    "model1=CatBoostClassifier()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 319,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.neighbors import KNeighborsClassifier \n",
    "  \n",
    "knn = KNeighborsClassifier(n_neighbors = 10) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 249,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import VotingClassifier\n",
    "vclf=VotingClassifier(estimators=[('xg',xgb),('lg',lgb),('cat',model1)],voting='soft')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 748,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<catboost.core.CatBoostClassifier at 0x1e16cbba7c8>"
      ]
     },
     "execution_count": 748,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Training the model with several classifier algorithms catboost is giving high accuracy,\n",
    "# so i decided to choose catboost classifier\n",
    "model.fit(X_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Python script for confusion matrix creation. \n",
    "from sklearn.metrics import confusion_matrix \n",
    "from sklearn.metrics import accuracy_score \n",
    "from sklearn.metrics import classification_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 749,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Confusion Matrix :\n",
      "results\n",
      "Accuracy Score : 0.9608695652173913\n",
      "Report : \n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.98      0.94      0.96       117\n",
      "           1       0.94      0.98      0.96       113\n",
      "\n",
      "    accuracy                           0.96       230\n",
      "   macro avg       0.96      0.96      0.96       230\n",
      "weighted avg       0.96      0.96      0.96       230\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Predicting on test data and checking the accuracy(precision and recall)\n",
    "y_pred = model.predict(X_test)\n",
    "print ('Confusion Matrix :')\n",
    "print('results') \n",
    "print( 'Accuracy Score :',accuracy_score(y_test, y_pred)) \n",
    "print( 'Report : ')\n",
    "print(classification_report(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 750,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predicting on given test Dataset\n",
    "predictions=model.predict(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 751,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>IsGoodNews</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   IsGoodNews\n",
       "0           1\n",
       "1           0\n",
       "2           0\n",
       "3           1\n",
       "4           0"
      ]
     },
     "execution_count": 751,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Creating the dataframe for results  \n",
    "result=pd.DataFrame(predictions,columns=[\"IsGoodNews\"])\n",
    "result.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 752,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([0, 1], dtype=int64), array([328, 199], dtype=int64))"
      ]
     },
     "execution_count": 752,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Checking  labels  \n",
    "np.unique(result.IsGoodNews,return_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 753,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loading the dataframe into excel file to make submission\n",
    "result.to_excel('newcat12.xlsx',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# I keep on tuning the parametres ,one of the paramteres gave best solution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# I used kfold cross validations for checking accuracy (train accuracy and cv score),if both are closely same \n",
    "#then our modeling is performing good\n",
    "\n",
    "from sklearn import model_selection\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "seed = 8\n",
    "skf=StratifiedKFold(n_splits=5, random_state=None)\n",
    "from sklearn.model_selection import cross_val_score\n",
    "seed = 8\n",
    "kfold = model_selection.KFold(n_splits=8, random_state=seed)\n",
    "score=cross_val_score(model,X_train,y_train,cv=skf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
